TF-IDF Analysis of Haiku

Build a single list of the TF-IDF-frequent n-grams across the haiku corpus
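
For reference, with use_idf=True and smooth_idf=True (the settings used below), scikit-learn's TfidfVectorizer scores each term t in document d as

$$\mathrm{tfidf}(t, d) = \mathrm{tf}(t, d)\cdot\left(\ln\frac{1 + n}{1 + \mathrm{df}(t)} + 1\right)$$

where n is the number of documents and df(t) is the number of documents containing t; each document vector is then scaled to unit l2 norm. High scores mark terms that are frequent in one haiku but rare across the collection.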


In [1]:
import os

import numpy as np

Create the corpus


In [3]:
corpus_dir                 = 'corpus_haiku'            # input directory
corpus_statistics_filename = 'haiku_multi-grams.txt'   # output file name

# output-file header
text_msg = "This file contains TF-IDF-frequent 3-6-gram words in Dr. Fisher's haiku\n\n"


In [ ]:
# Start the output file with its header.
with open(corpus_statistics_filename, 'w') as f:
    f.write(text_msg)

# Read every .txt file in the corpus directory.
corpus = []
titles = []
for item in sorted(os.listdir(corpus_dir)):
    if item.endswith(".txt"):
        titles.append(item)
        with open(os.path.join(corpus_dir, item), 'r') as f:
            corpus.append(f.read())

print("{:,} documents read".format(len(corpus)))

477 documents read

Extract the TF-IDF-frequent n-grams


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

set_of_words = set()

# Sweep the maximum n-gram length and the document-frequency cutoff,
# collecting each document's ten highest-scoring features at every setting.
for max_g in [3, 4, 5, 6]:
    for max_df in np.arange(0.2, 1.0, 0.1):

        tf = TfidfVectorizer(analyzer='word',
                             stop_words='english',
                             ngram_range=(1, max_g),   # unigrams up to max_g-grams
                             max_df=max_df,            # drop terms in more than this fraction of documents
                             min_df=1,
                             vocabulary=None,
                             norm='l2',
                             use_idf=True,
                             smooth_idf=True,
                             sublinear_tf=False)
        tfidf_matrix = tf.fit_transform(corpus)
        feature_names = tf.get_feature_names_out()

        dense = tfidf_matrix.toarray()

        for title_idx, title in enumerate(titles):
            # Indices of this document's ten highest TF-IDF scores.
            doc_10best_features = np.argsort(dense[title_idx])[::-1][:10]
            for idx in doc_10best_features:
                set_of_words.add(feature_names[idx])
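
A quick look at how many distinct n-grams the sweep collected:

In [ ]:
print("{:,} distinct n-grams collected".format(len(set_of_words)))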

Write the set of n-grams to the output file


In [5]:
# Append the sorted n-grams to the output file, one per line.
with open(corpus_statistics_filename, 'a') as f:
    for word in sorted(set_of_words):
        f.write(word + "\n")
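
As a final sanity check (a sketch, assuming the two header lines written above are the only non-n-gram lines), re-read the file and count the n-grams it contains:

In [ ]:
with open(corpus_statistics_filename) as f:
    lines = f.read().splitlines()
print("{:,} n-grams written".format(len(lines) - 2))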